<<<<<<< HEAD ======= >>>>>>> master
#Importing Data Set
ss14pusa = read.csv(file.choose(),header=TRUE)  # read csv file
ss14pusb = read.csv(file.choose(),header=TRUE)

#Libraries
require(plyr)
require(dplyr)
#Importing Data Set in a faster way
library(data.table)
variables=c("SEX","OCCP","WKHP","MAR","WAGP","SCHL","ESP","RAC1P","ST","PAOC","PWGTP")
ss14pusa=fread("~/Google Drive/Data for R/csv_pus/ss14pusa.csv",head=TRUE,select=variables)
ss14pusb=fread("~/Google Drive/Data for R/csv_pus/ss14pusb.csv",head=TRUE,select=variables)
attach(ss14pusa)
names(ss14pusa)
ss14pusa_edit = data.frame(SEX,OCCP,WKHP,MAR,WAGP,SCHL,ESP, RAC1P, ST, PAOC, PWGTP)
detach(ss14pusa)

attach(ss14pusb)
ss14pusb_edit = data.frame(SEX,OCCP,WKHP,MAR,WAGP,SCHL,ESP, RAC1P, ST, PAOC, PWGTP)
detach(ss14pusb)

Data = rbind(ss14pusa_edit,ss14pusb_edit)
colnames(Data) <- c("Gender","Occupation", "Work_hours", "Marriage", "Income", "Education" ,"Parental_Occupation", "Race", "State", "Children", "Weight")


write.csv(Data, file = "Data.csv",row.names=TRUE)
#Delete income=0/NA rows
row_to_keep=which(Data$Income>0)
Data=Data[row_to_keep,]
#Recode Region
#https://en.wikipedia.org/wiki/List_of_regions_of_the_United_States#/media/File:Census_Regions_and_Division_of_the_United_States.svg
require(car)
Data$Region <- recode(Data$State,"c(09,23,25,33,44,59,34,36,42)='Northeast';c(17,18,26,38,55,19,20,27,29,31,38,46,39)='Midwest'; c(10,12,13,24,37,45,51,11,54,01,21,28,47,05,22,40,48)='South';c(04,08,16,30,32,35,49,50,02,06,15,41,53,56)='West';'72'='Puerto Rico'")
# Recode Division
Data$Division <- recode(Data$State,"c(53,41,6,2,15)='Pacific';c(4,8,16,30,32,35,49,56)='Mountain'; c(19,20,27,29,31,38,46)='West North Central';c(17,18,26,39,55)='East North Central';c(5,22,40,48)='West South Central';c(1,21,28,47)='East South Central';c(34,36,42)='Middle Atlantic';c(10,11,12,13,24,37,45,51,54)='South Atlantic';c(9,23,25,33,44,50)='New England'")
#Recode Education
Data$Education <- recode(Data$Education,"c(01,02,03,04,05,06,07,08,09,10,11)='~greade8';c(12,13,14,15,16,17,18,19)='grade9~college_nodegree';c(20,21)='associate/bachelor';c(22,23)='master/professional';c(24)='doctor')
#Recode Education
library(car)
Data$Education <- recode(Data$Education,"1:11='~greade8';12:19='grade9~college_nodegree';20:21='associate/bachelor';22:23='master/professional';24='doctor'")
# On Survey Weights
# http://tophcito.blogspot.co.at/2014/04/social-science-goes-r-weighted-survey.html We assign everyone that did make it into the sample a weight. So people that turn out too often in the sample receive a weight of less than 1. And those that we were not able to reach enough of are upweighted with a weight larger than 1. Respondents that belong to groups that have been sampled perfectly receive a weight of 1. This solution is called post-stratification, because it computes weights based on group (or stratum) characteristics, like the distribution of age or gender proportions.

require(survey)
Data.w = svydesign(ids = ~1, data = Data, weights = Data$Weight)
summary(Data.w)

# comparison of the sex ratios in the unweighted and the weighted data frames:
prop.table(table(Data$Gender))
prop.table(svytable(~Gender, design = Data.w))

# Run this to see how it works
table(Data$Gender)
svytable(~Gender, design = Data.w)

# Take my interested variable 'State' as an example, the difference between the unweighted and weighted ratio is really small
prop.table(table(Data$State)) - prop.table(svytable(~State, design = Data.w))

# Ultimately, the difference for each income level between unweighted and weighted is small too
prop.table(table(Data$Income)) - prop.table(svytable(~Income, design = Data.w))
MotherWorking=select(Data_women, Income,Children,Work_hours)
#detach(package:plyr)
GroupedMotherWorking <-
  MotherWorking %>%
  na.omit() %>%
  group_by(Children,Work_hours) %>% 
  summarize(
    AvgIncome = mean(Income),
    count=n()
  ) 

library(plotly)
# note how size is automatically scaled and added as hover text

plot_ly(GroupedMotherWorking, x = Work_hours, y = AvgIncome,size=sqrt(count), color = Children,text = paste("Count: ", count),opacity=1-Children*0.2,mode = "markers")

#plot_ly(GroupedMotherWorking, x = Work_hours, y = AvgIncome, text = paste("Count: ", #count),mode = "markers", opacity=1-Children*0.2,color = Children)

Observations: 1. When work_hours smaller than 60Hrs, Avg Income tend to be positively related to work hour. 2. There are generally more females with no Children 3. Females with Children from 6 to 17 often has higher avg income

MotherOccupation=select(Data_women, Income,Children,Occupation)
#detach(package:plyr)
GroupedMotherOccupation <-
  MotherOccupation %>%
  na.omit() %>%
  group_by(Children,Occupation) %>% 
  summarize(
    AvgIncome = mean(Income),
    count=n()
  ) 


revalue(GroupedMotherOccupation$Children,c('1'='Children under 6','2'='Children from 6 to 17','3'='Children under 6 and above 6','4'='No Children'))

ggplot(GroupedMotherOccupation,aes(x=Children))+geom_point(aes(y=Occupation,size=count,colour=AvgIncome))+scale_colour_gradient(low="green",high="red")+scale_x_continuous(breaks=1:4, labels=c("Children under 6", "Children from 6 to 17", "Children under 6 and above 6", "No Children")) 

#scale_colour_gradientn(colours = terrain.colors(10))

Observations: 1. Females with Children from 6 to 17 usually tend to have higher average income as the dot has darker color. 2. Occupation of ENG, MGR and CMM usually tend to have higher salary. On the other hand, female in LGL field always have lower salary 3.

<<<<<<< HEAD
# Regional Effect
# wanna know how to use survey weights
# calculate median of women in each State and plot in mapview
# Investigate in Occupation or other vavriables to see why

DW=Data[Data$Gender==2, ]

# boxplot of each state/division/region to see distribution
require(plotly)
plot_ly(DW, x = Income, color = Region, type = "box")
plot_ly(DW, x = Income, color = Division, type = "box")
boxplot(DW$Income~DW$State) 

# calculate median of women in each Division/Region/State
library(dplyr)
DivMedian <- aggregate(Income~Division,Data_women, median)
RegMedian <- aggregate(Income~Region,Data_women, median)
StateMedian <- aggregate(Income~State,DW, median)

# Recode State Names
library(readr)
statenames <- read_csv("~/Google Drive/Columbia/5243 ADS/Project 1/data/statenames.csv")
names(StateMedian)[1]<-"code"
State.Median <- merge(StateMedian, statenames, by="code")
head(SM)

# plot state income median in mapview
require(googleVis)
map<-gvisGeoMap(data=State.Median, locationvar = "name", numvar="Income",options=list(region="US",dataMode="regions",width='800px',heigth='600px',gvis.editor = "Income Median of US States", colors="[0xEFEDF5, 0xA64D79]"))
plot(map)
=======
plot_ly(DW, x = Income, color = Division, type = "box")
n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
>>>>>>> master

From the graphs we see that the Northeast region, especially the New England Division, has higher income level than other parts of the States. The next question is why it happens, e.g. particular popular high-income occupations in certain regions?

---
title: "Women in Work Force"
output: html_notebook
---

```{r} 
#Importing Data Set
ss14pusa = read.csv(file.choose(),header=TRUE)  # read csv file
ss14pusb = read.csv(file.choose(),header=TRUE)

#Libraries
require(plyr)
require(dplyr)
```

```{r}
#Importing Data Set in a faster way
library(data.table)
variables=c("SEX","OCCP","WKHP","MAR","WAGP","SCHL","ESP","RAC1P","ST","PAOC","PWGTP")
ss14pusa=fread("~/Google Drive/Data for R/csv_pus/ss14pusa.csv",head=TRUE,select=variables)
ss14pusb=fread("~/Google Drive/Data for R/csv_pus/ss14pusb.csv",head=TRUE,select=variables)

```

```{r}
attach(ss14pusa)
names(ss14pusa)
ss14pusa_edit = data.frame(SEX,OCCP,WKHP,MAR,WAGP,SCHL,ESP, RAC1P, ST, PAOC, PWGTP)
detach(ss14pusa)

attach(ss14pusb)
ss14pusb_edit = data.frame(SEX,OCCP,WKHP,MAR,WAGP,SCHL,ESP, RAC1P, ST, PAOC, PWGTP)
detach(ss14pusb)

Data = rbind(ss14pusa_edit,ss14pusb_edit)
colnames(Data) <- c("Gender","Occupation", "Work_hours", "Marriage", "Income", "Education" ,"Parental_Occupation", "Race", "State", "Children", "Weight")


write.csv(Data, file = "Data.csv",row.names=TRUE)
```

```{r}
#Delete income=0/NA rows
row_to_keep=which(Data$Income>0)
Data=Data[row_to_keep,]
```

```{r, echo = FALSE}
#recode OCCP
class(Data$Occupation) <- "numeric"
Data$Occupation <- ifelse(Data$Occupation >= 10 & Data$Occupation <= 430, 1, Data$Occupation)
Data$Occupation <- ifelse(Data$Occupation >= 1005 & Data$Occupation <= 1240, 2, Data$Occupation)
Data$Occupation <- ifelse(Data$Occupation >= 800 & Data$Occupation <= 950, 3, Data$Occupation)
Data$Occupation <- ifelse(Data$Occupation >= 2105 & Data$Occupation <= 2160, 4, Data$Occupation)
Data$Occupation <- ifelse(Data$Occupation >= 3000 & Data$Occupation <= 3540, 5, Data$Occupation)
Data$Occupation <- ifelse(Data$Occupation >= 510 & Data$Occupation <= 740, 6, Data$Occupation)
Data$Occupation <- ifelse(Data$Occupation >= 1300 & Data$Occupation <= 1560, 7, Data$Occupation)
Data$Occupation <- ifelse(Data$Occupation >= 1600 & Data$Occupation <= 1965, 8, Data$Occupation)

Data <- filter(Data, Occupation %in% c(1:8))
Data$Occupation <- as.factor(Data$Occupation)

levels(Data$Occupation) <- c('MGR', 'CMM', 'FIN', 'LGL', 'MED' , 'BUS', 'ENG', 'SCI')
```

```{r}
#Recode Region
#https://en.wikipedia.org/wiki/List_of_regions_of_the_United_States#/media/File:Census_Regions_and_Division_of_the_United_States.svg
require(car)
Data$Region <- recode(Data$State,"c(09,23,25,33,44,59,34,36,42)='Northeast';c(17,18,26,38,55,19,20,27,29,31,38,46,39)='Midwest'; c(10,12,13,24,37,45,51,11,54,01,21,28,47,05,22,40,48)='South';c(04,08,16,30,32,35,49,50,02,06,15,41,53,56)='West';'72'='Puerto Rico'")
```
```{r}
# Recode Division
Data$Division <- recode(Data$State,"c(53,41,6,2,15)='Pacific';c(4,8,16,30,32,35,49,56)='Mountain'; c(19,20,27,29,31,38,46)='West North Central';c(17,18,26,39,55)='East North Central';c(5,22,40,48)='West South Central';c(1,21,28,47)='East South Central';c(34,36,42)='Middle Atlantic';c(10,11,12,13,24,37,45,51,54)='South Atlantic';c(9,23,25,33,44,50)='New England'")
```

```{r}
#Recode Education
Data$Education <- recode(Data$Education,"c(01,02,03,04,05,06,07,08,09,10,11)='~greade8';c(12,13,14,15,16,17,18,19)='grade9~college_nodegree';c(20,21)='associate/bachelor';c(22,23)='master/professional';c(24)='doctor')

```

```{r}
#Recode Education
library(car)
Data$Education <- recode(Data$Education,"1:11='~greade8';12:19='grade9~college_nodegree';20:21='associate/bachelor';22:23='master/professional';24='doctor'")
```

```{r}
# On Survey Weights
# http://tophcito.blogspot.co.at/2014/04/social-science-goes-r-weighted-survey.html We assign everyone that did make it into the sample a weight. So people that turn out too often in the sample receive a weight of less than 1. And those that we were not able to reach enough of are upweighted with a weight larger than 1. Respondents that belong to groups that have been sampled perfectly receive a weight of 1. This solution is called post-stratification, because it computes weights based on group (or stratum) characteristics, like the distribution of age or gender proportions.

require(survey)
Data.w = svydesign(ids = ~1, data = Data, weights = Data$Weight)
summary(Data.w)

# comparison of the sex ratios in the unweighted and the weighted data frames:
prop.table(table(Data$Gender))
prop.table(svytable(~Gender, design = Data.w))

# Run this to see how it works
table(Data$Gender)
svytable(~Gender, design = Data.w)

# Take my interested variable 'State' as an example, the difference between the unweighted and weighted ratio is really small
prop.table(table(Data$State)) - prop.table(svytable(~State, design = Data.w))

# Ultimately, the difference for each income level between unweighted and weighted is small too
prop.table(table(Data$Income)) - prop.table(svytable(~Income, design = Data.w))

```

```{r, echo = FALSE}
#add in weights...example code from previous project:
require(survey)

df1<-svrepdesign(variables=pus_new[,1:13], 
repweights=pus_new[,14:93], type="BRR",combined.weights=TRUE,
weights=pus_new$PWGTP)
summary(df1)
svymean(~ WAGP,df1, na.rm = T)

#If We only want to know the SE and Mean of Statistics Income?

pus_new$FOD1P[pus_new$FOD1P == 3702 | pus_new$FOD1P == 6212 | pus_new$FOD1P == 6202]<-"Statistics"
df<-subset(pus_new, pus_new$FOD1P=="Statistics")
df2<-svrepdesign(variables=df[,1:13], 
repweights=df[,14:93], type="BRR",combined.weights=TRUE,
weights=df$PWGTP)
summary(df2)
svymean(~ WAGP,df2, na.rm = T)
```

```{r, echo = FALSE}

#median wage for each occupation by gender (just to acknowledge that there *is* a gender gap)
require(ggplot2)
Data_sex_occp <- ddply(Data, .(Gender, Occupation), summarise, MEAN = weighted.mean(Income, Weight, na.rm = T))
ggplot(Data_sex_occp, aes(x=Occupation, y=MEAN, fill=factor(Gender))) + 
  geom_bar(stat="identity",position="dodge") + 
  scale_fill_brewer(palette="RdYlGn") +
  labs(fill="") + 
  ylab("Mean Salary ($)") + 
  xlab("Occupations") + 
  ggtitle(paste("Salary Comparison between Men & Women")) + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1), 
        panel.background = element_rect(fill = 'white' )) + 
  theme_grey(base_size = 12)

```

```{r, echo = FALSE}
#subset on women, explore groups of women
Data_women <- filter(Data, Gender == 2)
#Race - Catherine
#Motherhood - 
#Marital Status - Erica (potentially add in region)
#Proportion of graduate degrees in each occupation?
#Region - Vivien

```

```{r, echo = FALSE}
Data_women <- filter(Data, Gender == 2)
MotherHood=select(Data_women, Income,Children)
#detach(package:plyr)
GroupedMotherHood <-
  MotherHood %>%
  na.omit() %>%
  group_by(Children) %>% 
  summarize(
    AvgIncome = mean(Income)
  ) 


### Bar Chart
ggplot(GroupedMotherHood, aes(x=Children, y=AvgIncome)) + 
  geom_bar(colour="black", fill="#DD8888", width=.5, stat="identity") + 
  guides(fill=FALSE) +
  xlab("Children") + ylab("Mean Income") +
  coord_cartesian(ylim = c(50000,70000)) +
  #scale_y_continuous(limits = c(40000,70000),oob = rescale_none) +
  ggtitle("Income for Motherhood")



### 
```

```{r}
MotherWorking=select(Data_women, Income,Children,Work_hours)
#detach(package:plyr)
GroupedMotherWorking <-
  MotherWorking %>%
  na.omit() %>%
  group_by(Children,Work_hours) %>% 
  summarize(
    AvgIncome = mean(Income),
    count=n()
  ) 

library(plotly)
# note how size is automatically scaled and added as hover text

plot_ly(GroupedMotherWorking, x = Work_hours, y = AvgIncome,size=sqrt(count), color = Children,text = paste("Count: ", count),opacity=1-Children*0.2,mode = "markers")

#plot_ly(GroupedMotherWorking, x = Work_hours, y = AvgIncome, text = paste("Count: ", #count),mode = "markers", opacity=1-Children*0.2,color = Children)
```
Observations:
1. When work_hours smaller than 60Hrs, Avg Income tend to be positively related to work hour.
2. There are generally more females with no Children
3. Females with Children from 6 to 17 often has higher avg income
```{r}
MotherOccupation=select(Data_women, Income,Children,Occupation)
#detach(package:plyr)
GroupedMotherOccupation <-
  MotherOccupation %>%
  na.omit() %>%
  group_by(Children,Occupation) %>% 
  summarize(
    AvgIncome = mean(Income),
    count=n()
  ) 


revalue(GroupedMotherOccupation$Children,c('1'='Children under 6','2'='Children from 6 to 17','3'='Children under 6 and above 6','4'='No Children'))

ggplot(GroupedMotherOccupation,aes(x=Children))+geom_point(aes(y=Occupation,size=count,colour=AvgIncome))+scale_colour_gradient(low="green",high="red")+scale_x_continuous(breaks=1:4, labels=c("Children under 6", "Children from 6 to 17", "Children under 6 and above 6", "No Children")) 

#scale_colour_gradientn(colours = terrain.colors(10))
```
Observations:
1. Females with Children from 6 to 17 usually tend to have higher average income as the dot has darker color.
2. Occupation of ENG, MGR and CMM usually tend to have higher salary. On the other hand, female in LGL field always have lower salary
3. 

```{r}
# Regional Effect
# wanna know how to use survey weights
# calculate median of women in each State and plot in mapview
# Investigate in Occupation or other vavriables to see why

DW=Data[Data$Gender==2, ]

# boxplot of each state/division/region to see distribution
require(plotly)
plot_ly(DW, x = Income, color = Region, type = "box")
plot_ly(DW, x = Income, color = Division, type = "box")
boxplot(DW$Income~DW$State) 

# calculate median of women in each Division/Region/State
library(dplyr)
DivMedian <- aggregate(Income~Division,Data_women, median)
RegMedian <- aggregate(Income~Region,Data_women, median)
StateMedian <- aggregate(Income~State,DW, median)

# Recode State Names
library(readr)
statenames <- read_csv("~/Google Drive/Columbia/5243 ADS/Project 1/data/statenames.csv")
names(StateMedian)[1]<-"code"
State.Median <- merge(StateMedian, statenames, by="code")
head(SM)

# plot state income median in mapview
require(googleVis)
map<-gvisGeoMap(data=State.Median, locationvar = "name", numvar="Income",options=list(region="US",dataMode="regions",width='800px',heigth='600px',gvis.editor = "Income Median of US States", colors="[0xEFEDF5, 0xA64D79]"))
plot(map)


```
From the graphs we see that the Northeast region, especially the New England Division, has higher income level than other parts of the States. The next question is why it happens, e.g. particular popular high-income occupations in certain regions?


